In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from process import read_data
from process import process_hours_columns

import plotly.graph_objects as go
In [2]:
years = list(range(2012, 2021))
folder_path = '../data/overtime/'
file_name = 'Details.xlsx'
encoding = 'ISO-8859-1'
target_col = []
drop_col = ['Street', 'xStreet', 'Customer No.', 'Customer', 'Customer Address', 'Customer Address 1', 'Customer Address 3', 'Customer_City', 'Customer_State', 'Customer_zip']
# Read all the data
dfs = read_data(folder_path, file_name, years, encoding, target_col, drop_col, file_type='xlsx')

# # test
# years = list(range(2012, 2013))
# dfs2 = read_data(folder_path, file_name, years, encoding, target_col, drop_col, file_type='xlsx')
Read data from: ../data/overtime/2012_Details.xlsx
Read data from: ../data/overtime/2013_Details.xlsx
Read data from: ../data/overtime/2014_Details.xlsx
Read data from: ../data/overtime/2015_Details.xlsx
Read data from: ../data/overtime/2016_Details.xlsx
Read data from: ../data/overtime/2017_Details.xlsx
Read data from: ../data/overtime/2018_Details.xlsx
Read data from: ../data/overtime/2019_Details.xlsx
Read data from: ../data/overtime/2020_Details.xlsx
Successfully read data from all files.
In [3]:
process_hours_columns(dfs, 'Hours\nWorked')

df = pd.concat(dfs, ignore_index=True)
print(df.sample(5))

print(df.columns.tolist())
        Job No.   Emp. ID            Employee  Rank           Location  \
68067   33667.0  102373.0  MYLETT,CHRISTOPHER     9        25 UNION ST   
407571      NaN   10728.0    MILIEN,ROCHEFORT     9                NaN   
180503  10768.0   11710.0    LOCKHEAD,KEVIN F     9      BENNINGTON ST   
49669   92020.0    9704.0  BLICKER,CHARLES R.     9  368 DORCHESTER ST   
679338      NaN  106695.0    SAINTFORT,FRANTZ     9                NaN   

               Detail\nDate  Start\nTime  End \nTime  Hours\nWorked  \
68067   2012-10-12 00:00:00         2345         215            2.5   
407571  2015-06-04 00:00:00          700        1530            8.5   
180503  2013-07-15 00:00:00         1000        1500            5.0   
49669   2012-07-19 00:00:00         1600        1700            1.0   
679338  2017-10-10 00:00:00          730        1530            8.0   

        Hours\nPaid Type          Address             City/State/Zip  YEAR  \
68067             4    S     25 UNION ST.           BOSTON, MA 02108  2012   
407571            9    Z              NaN                        NaN  2015   
180503            8    Z  P.O. BOX 220801  DORCHESTER, MA 02122-0004  2013   
49669             4    Z  P.O. BOX 220801  DORCHESTER, MA 02122-0004  2012   
679338            8    C              NaN                        NaN  2017   

        Tracking_No          c  
68067           NaN        NaN  
407571    1395141.0        NaN  
180503          NaN        NaN  
49669           NaN        NaN  
679338          NaN  1803973.0  
['Job No.', 'Emp. ID', 'Employee', 'Rank', 'Location', 'Detail\nDate', 'Start\nTime', 'End \nTime', 'Hours\nWorked', 'Hours\nPaid', 'Type', 'Address', 'City/State/Zip', 'YEAR', 'Tracking_No', 'c']
In [4]:
print(df.dtypes)
Job No.           float64
Emp. ID           float64
Employee           object
Rank                int64
Location           object
Detail\nDate       object
Start\nTime         int64
End \nTime          int64
Hours\nWorked     float64
Hours\nPaid         int64
Type               object
Address            object
City/State/Zip     object
YEAR                int64
Tracking_No       float64
c                 float64
dtype: object
In [5]:
# Step 1: Compute the ratio
df['Ratio'] = df['Hours\nWorked'] / df['Hours\nPaid']

# Step 2: Group by Rank and calculate the mean ratio
rank_avg = df.groupby('Rank')['Ratio'].mean().reset_index()

# (Optional) Sort by average ratio for cleaner display
rank_avg = rank_avg.sort_values('Ratio', ascending=False)

# Step 3: Plot one bar per Rank
fig = go.Figure()

# Add a single bar trace
fig.add_trace(go.Bar(
    x=rank_avg['Rank'],          # X-axis: Rank
    y=rank_avg['Ratio'],          # Y-axis: Average Ratio
    marker=dict(
        color=rank_avg['Ratio'],  # Use Ratio value to shade the bars
        colorscale='Viridis'      # Viridis color scale
    )
))

# Update layout properly
fig.update_layout(
    title='Average Worked-to-Paid Overtime Ratio by Rank', # Title
    xaxis_title='Rank',                                    # X-axis label
    yaxis_title='Average WRKDHRS / OTHOURS Ratio',         # Y-axis label
    xaxis_tickangle=0,                                     # Keep x labels straight
    plot_bgcolor='white',                                  # White background
    margin=dict(l=20, r=20, t=80, b=80),
    yaxis=dict(
        range=[0, 1.1],     # y-axis from 0 to 1.1
        showgrid=True,      # THIS is how you enable grid lines
        gridcolor='lightgrey' # Optional: make the grid lines light grey
    ),
)

# Show the figure
fig.show()
In [6]:
df.sample(5)[['Employee', 'Ratio', 'YEAR']]
Out[6]:
Employee Ratio YEAR
139542 HOBIN,JOSEPH 0.944444 2013
744750 BUTLER,BOBBIE 0.812500 2018
234038 SPILLANE,MATTHEW 1.000000 2013
879554 CONLEY,KENNETH M 1.000000 2019
952487 GARCIA,ELVIS 0.562500 2020
In [7]:
# Create the figure
fig = go.Figure()

# Create figure
fig = go.Figure()

# Add histogram with fine bins
fig.add_trace(go.Histogram(
    x=df["Ratio"],
    xbins=dict(
        start=0,
        end=1,
        size=0.1  
    ),
    marker_color='steelblue',
    marker_line_color='black',
    marker_line_width=1
))

# Update layout
fig.update_layout(
    title="Distribution of Ratios of Overtime Paid Hours / Worked Hours (2012–2021)",
    xaxis_title="Ratio of Overtime Paid Hours / Worked Hours",
    yaxis_title="Frequency",
    plot_bgcolor='white',
    bargap=0.2,
    margin=dict(l=20, r=20, t=80, b=80),
    yaxis=dict(showgrid=True, gridcolor='lightgrey'),
    xaxis=dict(range=[0, 1])
)

# Show figure
fig.show()
In [8]:
def calculate_ratio(df):
    total_wrkdhrs = df['Hours\nWorked'].sum()
    total_othours = df['Hours\nPaid'].sum()
    return total_wrkdhrs / total_othours if total_othours else 0



# Calculate the ratio for each year
years = []
ratios = []
for year, small_df in df.groupby('YEAR'):
    years.append(year)
    ratios.append(calculate_ratio(small_df))


# Create the figure
fig = go.Figure()

# Add line+marker trace
fig.add_trace(go.Scatter(
    x=years,
    y=ratios,
    mode='lines+markers',
    marker=dict(color='blue'),
    line=dict(color='blue'),
    name='Ratio Over Years'
))

# Update layout
fig.update_layout(
    title='Changes in Ratio Over the Years',
    xaxis_title='Year',
    yaxis_title='Ratio (Worked Hours / Overtime Paid Hours)',
    plot_bgcolor='white',
    margin=dict(l=20, r=20, t=80, b=80),
    yaxis=dict(showgrid=True, gridcolor='lightgrey'),
    xaxis=dict(tickmode='linear')
)

# Show figure
fig.show()
In [9]:
# Filter data for YEAR == 2020
df_2020 = df[df['YEAR'] == 2020]

# Calculate Ratio safely
df_2020['Ratio'] = df_2020.apply(
    lambda row: row['Hours\nWorked'] / row['Hours\nPaid'] if row['Hours\nPaid'] != 0 else None, axis=1
)

# Drop missing ratios
ratios_2020 = df_2020['Ratio'].dropna()

# Calculate Q1, Median, Q3 manually
q1 = np.percentile(ratios_2020, 25)
median = np.percentile(ratios_2020, 50)
q3 = np.percentile(ratios_2020, 75)

# Create box plot
fig = go.Figure()


fig.add_trace(go.Box(
    x=ratios_2020,
    boxpoints='outliers',
    marker_color='lightgreen',
    line_color='black',
    orientation='h'
))


label_x = max(ratios_2020) + 0.2  

# Add aligned annotations
fig.add_annotation(x=label_x, y=-0.2, text=f"Q1: {q1:.2f}", showarrow=False, font=dict(size=12), align='left')
fig.add_annotation(x=label_x, y=0, text=f"Median: {median:.2f}", showarrow=False, font=dict(size=12), align='left')
fig.add_annotation(x=label_x, y=0.2, text=f"Q3: {q3:.2f}", showarrow=False, font=dict(size=12), align='left')

# Update layout
fig.update_layout(
    title='Overtime Ratio Distribution (Worked / Paid) - 2020',
    xaxis_title='WRKDHRS / OTHOURS',
    plot_bgcolor='white',
    margin=dict(l=20, r=20, t=80, b=80),
    yaxis=dict(
        showgrid=True,
        zeroline=False,
        showticklabels=False
    )
)

# Show figure
fig.show()
/var/folders/7y/3cxkc6t534dg2mbwhpb4mbf80000gn/T/ipykernel_43571/640165696.py:5: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [10]:
iqr = q3 - q1

# Define fences
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
# Identify outliers
outlier_rows = df_2020[df_2020['Ratio'] < 0.26]
# # Filter outliers

# Print only ID and Ratio
print("Outlier records :")
# print(outliers)
print(outlier_rows[['Emp. ID', 'Ratio']])
Outlier records :
         Emp. ID  Ratio
890663  103524.0   0.25
890742  108895.0   0.25
890811    9757.0   0.25
890923    9365.0   0.25
890947  126474.0   0.25
...          ...    ...
958645   97369.0   0.25
958654    9760.0   0.25
958711   10813.0   0.25
958749  106709.0   0.25
958755    9031.0   0.25

[1242 rows x 2 columns]
In [11]:
import plotly.graph_objects as go
import pandas as pd

# Group by Location and calculate mean overtime ratio
location_grouped = df.groupby('Location')['Ratio'].mean()

# Sort by ratio descending
location_grouped = location_grouped.sort_values(ascending=False)

# Filter locations with reasonable Ratio (e.g., less than 0.8)
filtered_locations = location_grouped[location_grouped < 0.8]

# Handle case: if too few left, pick highest still
if len(filtered_locations) < 10:
    filtered_locations = location_grouped.head(15)

# Compute 'Other' mean
other_mean = location_grouped[~location_grouped.index.isin(filtered_locations.index)].mean()

# Combine filtered locations and 'Other'
final_locations = pd.concat([filtered_locations, pd.Series({'Other': other_mean})])

# Create the figure
fig = go.Figure()

# Add a vertical bar chart trace
fig.add_trace(go.Bar(
    x=final_locations.index,           # Locations on the X-axis
    y=final_locations.values,          # Average ratio on the Y-axis
    marker=dict(
        color=final_locations.values,  # Color bars based on ratio values
        colorscale='Blues',             # Use a blue color scale
        colorbar=dict(title='Ratio')    # Add a color bar with title
    )
))

# Update layout
fig.update_layout(
    title='Overtime Pay Ratio by Location (Filtered + Other)',
    xaxis_title='Location',
    yaxis_title='WRKDHRS / OTHOURS',
    height=600,
    width=1200,
    template='simple_white',
    xaxis_tickangle=-45
)

# Show the figure
fig.show()